# One reviewer challenged us on the claim that geotagged protest images do not differ from non-geotagged images.  This script investigates that in two ways:
# 	1. Compare Charlottesville images w/ geotag to those w/o.
# 	2. Compare images w/ country geotag to those without.
##########################
##
##	GLOBALS
##
##########################
# Fix the RNG seed so the resampling done later in this script is reproducible.
# (Same value as the literal 01282017; the leading zero is not significant.)
set.seed(1282017)
# setwd('/path/to/replication')

# Packages. The load order is left untouched because it decides which package
# wins when two of them export the same function name.
library(ggplot2)
library(ggstance)  # For better dodging
library(Hmisc) # To capitalize first letter
library(data.table)
library(dplyr)

# Print numbers in fixed rather than scientific notation.
options(scipen = 10000)

##########################
##
##	THIS PROJECT
##
##########################
# Load the data for the countries analysed in this project.
tweets <- read.csv("./Data/02_processedData/c_DonghyeonAlexmerged_Newclassifiers_ShortSpain.csv")


### What if just use place type from Twitter?
# Place types are stored wrapped as b'...'; map each wrapped label to its plain
# name. There is only one poi, so it is simply treated as a neighborhood.
place_type_recodes <- c(
	"b'admin'" = "admin",
	"b'city'" = "city",
	"b'country'" = "country",
	"b'poi'" = "neighborhood"
)
for (wrapped_label in names(place_type_recodes)) {
	tweets$place.place_type <- gsub(
		wrapped_label,
		place_type_recodes[[wrapped_label]],
		tweets$place.place_type
	)
}
# Capitalize the first letter (Hmisc) so the labels read well on the plots.
tweets$place.place_type <- capitalize(tweets$place.place_type)

### Countries in paper
# Keep rows whose country code contains any of the listed codes.
country_pattern <- paste(c("b'HK'", "ES", "KR", "PK", "VE"), collapse = "|")
in_paper <- grepl(country_pattern, tweets$place.country_code)
these <- tweets[in_paper, ]
these$place.country_code <- gsub("b'HK'", "HK", these$place.country_code)

### Twitter assigns Hong Kong only country, admin levels but inspection shows they are really districts, so I will call them neighborhood
is_hong_kong <- these$place.country_code == "HK"
these$place.place_type[is_hong_kong] <- "Neighborhood"


### Change factor order
# Levels run from the coarsest to the finest location type; any value outside
# this list becomes NA.
place_type_order <- c("Country", "Admin", "City", "Neighborhood")
these$place.place_type <- factor(these$place.place_type, levels = place_type_order)


# Draw one "bias by aggregation level" figure and write it to disk.
#
# Every row is drawn as a faint point, with the per-category mean +/- one
# standard error overlaid in green, faceted by country (one row per country).
#
#   plot_data : data frame holding `place.place_type` (a factor; its levels set
#               the y-axis order) and `place.country_code`
#   x_var     : name, as a string, of the column shown on the x axis
#   out_file  : file path handed to ggsave()
#   x_limits  : optional c(min, max) zoom for the x axis; NULL leaves it free
#
# Returns the ggplot object invisibly.
plot_bias_by_agg <- function(plot_data, x_var, out_file, x_limits = NULL) {
	fig <- ggplot(plot_data, aes(x = .data[[x_var]], y = place.place_type)) +
		geom_point(alpha = .1, size = .5) +
		scale_y_discrete(name = "", limits = rev(levels(plot_data$place.place_type))) +
		# Facet on the column of the plotted data instead of `these$...`, so the
		# figure does not depend on a global that is reassigned later on.
		facet_grid(rows = vars(place.country_code)) +
		xlab("") +
		theme_classic() +
		theme(text = element_text(size = 16)) +
		# mean_se is what stat_summary() falls back to when nothing is supplied;
		# naming it documents the choice and silences the fallback message.
		stat_summary(fun.data = mean_se, color = "forestgreen")

	if (!is.null(x_limits)) {
		fig <- fig + coord_cartesian(xlim = x_limits)
	}

	# Save the plot object explicitly rather than relying on last_plot().
	ggsave(plot = fig, filename = out_file, width = 5, height = 4, units = "in")
	invisible(fig)
}

# Plot protester violence
plot_bias_by_agg(these, "protest_result.protester_violence", "./Figures/biasByAgg_protesterviolence_wSummary.jpg", x_limits = c(0, 1))

# Plot state violence
plot_bias_by_agg(these, "protest_result.state_violence", "./Figures/biasByAgg_stateviolence_wSummary.jpg")

# Plot faces per photo
plot_bias_by_agg(these, "totalFaces", "./Figures/biasByAgg_totalFaces_wSummary.jpg")


##########################
##
##	CHARLOTTESVILLE
##
##########################
#### ORIGINAL DATA
# Path corrected from '/.Data/...' (an absolute path into a hidden root folder)
# to './Data/...', matching every other data path used in this script.
cville <- fread('./Data/Charlottesville/5_finalData_v2.csv')

####
# LOAD SCENE CLASSIFIER RESULTS
####
# NOTE: `these` is reused here and from now on holds file paths, not tweets.
these <- list.files(path='./Data/Charlottesville/ConvNeurNet_Results', full.names=TRUE)
these <- grep('scene2', these, value=TRUE)
if (length(these) == 0) {
	# Fail here with a clear message rather than further down on an empty table.
	stop("No 'scene2' result files found in ./Data/Charlottesville/ConvNeurNet_Results", call. = FALSE)
}

## Make one dataset
# Read every file once (printing its row count as a sanity check) and bind the
# tables in a single call instead of growing `data` with rbind() in a loop.
scene_tables <- lapply(these, function(path) {
	scene_table <- fread(path)
	print(nrow(scene_table))
	scene_table
})
# rev() keeps the row order of the old loop, which put each new file in front;
# use.names=TRUE binds columns by name, as rbind() on data.tables does.
data <- rbindlist(rev(scene_tables), use.names = TRUE)

## Make tweet id
# Extract the id from each image path: take the last path component and keep
# everything before its first dot, e.g. 'some/dir/12345.jpg' -> '12345'.
#
#   x : character vector of file paths
#
# Returns a character vector of the same length as `x`. Empty strings map to ''
# and NA stays NA, so the result always lines up row-for-row with the input
# (the previous strsplit/unlist version silently dropped empty elements).
dothis <- function(x){
	file_names <- basename(x)
	sub('\\..*$', '', file_names)
}

# Derive the id from the image path and store it as a number for the merge.
# NOTE(review): tweet ids can exceed 2^53, beyond which doubles no longer hold
# every integer exactly, so distinct ids could collide after as.numeric().
# Confirm against the type of cville$tweet_id before relying on the merge.
image_ids <- dothis(data$imgpath)
data$tweet_id <- as.numeric(image_ids)

####
# MERGE
####
# Left join on tweet_id: every row of cville is kept, rows of data without a
# match in cville are dropped.
df <- merge(cville, data, by='tweet_id', all.x=TRUE, all.y=FALSE)

# Keep only those with ratings for protest, state violence, protester violence
df <- df[!is.na(df$protest), ]

# Keep only those of protest, based on threshold for paper
df <- df[df$protest >= 0.6, ]

####
# PLOT, GPS TYPE VS NO AND SELF LOCATION
####

## Split blank into 2 categories
# Rows whose place type is 'Blank' are relabelled by whether user.location is
# filled in. The labels are written in place with logical masks: the previous
# split / rbind / assign-back approach wrote the rows back in a different order
# and broke when user.location was NA (such rows fell out of both halves).
# NOTE(review): an NA user.location is treated as 'No Location' -- confirm.
is_blank <- !is.na(df$place.place_type) & df$place.place_type == 'Blank'
has_profile_location <- !is.na(df$user.location) & df$user.location != ''

df$place.place_type[is_blank & has_profile_location] <- 'Profile Location'
df$place.place_type[is_blank & !has_profile_location] <- 'No Location'


# Draw `keep` rows per location type, with replacement.
# NOTE(review): `keep` is not defined anywhere in this script, so the line below
# used to stop with "object 'keep' not found". If no numeric `keep` exists, fall
# back to the size of the smallest location group. This default is an
# assumption -- confirm the value that was used for the published figure.
if (!exists("keep", mode = "numeric")) {
	keep <- min(table(df$place.place_type))
	message("`keep` was not set; sampling ", keep, " rows per location type.")
}

toplot <- df %>%
	group_by(place.place_type) %>%
	sample_n(keep, replace=TRUE) %>%
	ungroup()

# data.table's melt() only has a method for data.tables; convert explicitly so
# the call does not depend on a fallback to another package.
toplot_melt <- melt(
	as.data.table(toplot),
	measure.vars=c('protest', 'state_violence', 'protester_violence'),
	id.vars=c('place.place_type')
)

# Beautify
# Replace raw variable names with display labels; other values are left alone.
variable_labels <- c(
	'protest' = 'Protest',
	'state_violence' = 'State Violence',
	'protester_violence' = 'Protester Violence'
)
toplot_melt$variable <- as.character(toplot_melt$variable)
known_variable <- toplot_melt$variable %in% names(variable_labels)
toplot_melt$variable[known_variable] <- unname(variable_labels[toplot_melt$variable[known_variable]])

# Rename the grouping column to the label wanted on the plot.
names(toplot_melt) <- replace(names(toplot_melt), names(toplot_melt) == 'place.place_type', 'Location')
toplot_melt$Location <- as.character(toplot_melt$Location)

# Capitalize the lower-case place types; other values are left alone.
location_labels <- c(
	'country' = 'Country',
	'neighborhood' = 'Neighborhood',
	'city' = 'City',
	'admin' = 'Admin'
)
known_location <- toplot_melt$Location %in% names(location_labels)
toplot_melt$Location[known_location] <- unname(location_labels[toplot_melt$Location[known_location]])


# Charlottesville figure: each sampled row as a faint point, with the mean and
# its normal-theory confidence interval (mean_cl_normal, via Hmisc) in green,
# one facet row per outcome. Location values outside `location_order` are not
# shown on the y axis.
location_order <- c('No Location', 'Profile Location', 'Country', 'Admin', 'City', 'Neighborhood')
cville_fig <- ggplot(toplot_melt, aes(x=value, y=Location)) +
	geom_point(alpha=.1, size=.5) +
	# `rows=` spelled out: the old `row=` only worked via partial matching.
	facet_grid(rows=vars(variable)) +
	theme_classic() +
	xlab('') +
	ylab('') +
	stat_summary(fun.data=mean_cl_normal, color='forestgreen', size=1) +
	theme(text = element_text(size=16)) +
	scale_y_discrete(name="", limits=rev(location_order))

# NOTE(review): no width/height is given, so the saved size follows the current
# graphics device; the other figures in this script use 5 x 4 in.
ggsave(filename='./Figures/biasByAgg_cville3.jpg', plot=cville_fig)

